In [14]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
import magic
import os
import nltk

import warnings

# Read the OpenAI key from the environment. Using `or` (instead of a getenv
# default) also covers a variable that is set but empty, so the value is never "".
openai_api_key = os.getenv("OPENAI_API_KEY") or "YourAPIKey"

# "YourAPIKey" is only a placeholder, not a usable credential. Warn right here so
# the problem is visible in this cell rather than as an authentication error
# raised by a later cell.
if openai_api_key == "YourAPIKey":
    warnings.warn(
        "OPENAI_API_KEY is not set; falling back to the placeholder 'YourAPIKey'. "
        "Set the environment variable before running the cells that call OpenAI."
    )

# nltk.download('averaged_perceptron_tagger')

# pip install unstructured
# Other dependencies to install https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/unstructured_file.html
# pip install python-magic-bin
# pip install chromadb
# pip install faiss-cpu  (required by the FAISS vector store used in this notebook)

In [None]:
# Unpack the bundled data archive into the parent directory

import zipfile

archive_path = '../../data.zip'
extract_dir = '..'
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

In [2]:
# Configure a directory loader over the small Paul Graham essay folder,
# restricted to files matching the '**/*.txt' glob
loader = DirectoryLoader(
    '../data/PaulGrahamEssaySmall/',
    glob='**/*.txt',
)

In [3]:
# Run the directory loader and keep whatever it returns as `documents`;
# this is the input to the splitting step.
documents = loader.load()

In [4]:
# Set up the recursive splitter with a chunk size of 1000 and no overlap between chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)

In [5]:
# Break the loaded documents into chunks with the splitter;
# `texts` holds the resulting pieces that get embedded and indexed.
texts = text_splitter.split_documents(documents)

In [6]:
# Create the OpenAI embeddings object, configured with the API key read in the first cell.
# NOTE(review): nothing is embedded on this line — `texts` is only passed in when the
# vector store is built.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [7]:
# Build the FAISS vector store (`docsearch`) from the chunks, using `embeddings`
# to vectorize them. Presumably this is where the embedding API is actually called — confirm.
docsearch = FAISS.from_documents(texts, embeddings)

In [8]:
# Load up your LLM
# temperature=0 asks the model for its most likely completion each time, so
# re-running the same query gives (near-)repeatable answers instead of a fresh
# paraphrase on every run.
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

In [9]:
# Wire the LLM and the vector-store retriever into a "stuff"-type RetrievalQA chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

In [10]:
# Ask the chain a question. `run` takes the query string directly; as the last
# expression in the cell, its return value (the answer text) is displayed as output.
query = "What did McCarthy discover?"
qa.run(query)

' McCarthy discovered that a programming language could be constructed from a handful of simple operators and a notation for functions, using a data structure called a list for both code and data.'

### Sources

In [11]:
# Rebuild the chain so that it also returns the source documents with the answer
query = "What did McCarthy discover?"
qa = RetrievalQA.from_chain_type(
    retriever=docsearch.as_retriever(),
    chain_type="stuff",
    llm=llm,
    return_source_documents=True,
)
# Call the chain with a dict input; the returned mapping is indexed by key in the next cells
result = qa({"query": query})

In [12]:
# Answer text stored under the 'result' key of the chain's output
result['result']

' McCarthy discovered a way to build a whole programming language using a handful of simple operators and a notation for functions. He called this language Lisp, for "List Processing," because one of his key ideas was to use a simple data structure called a list for both code and data.'

In [13]:
# Documents returned alongside the answer (requested via return_source_documents=True)
result['source_documents']

[Document(page_content='May 2001\n\n(I wrote this article to help myself understand exactly\n\nwhat McCarthy discovered.  You don\'t need to know this stuff\n\nto program in Lisp, but it should be helpful to\n\nanyone who wants to\n\nunderstand the essence of Lisp \x97 both in the sense of its\n\norigins and its semantic core.  The fact that it has such a core\n\nis one of Lisp\'s distinguishing features, and the reason why,\n\nunlike other languages, Lisp has dialects.)In 1960, John\n\nMcCarthy published a remarkable paper in\n\nwhich he did for programming something like what Euclid did for\n\ngeometry. He showed how, given a handful of simple\n\noperators and a notation for functions, you can\n\nbuild a whole programming language.\n\nHe called this language Lisp, for "List Processing,"\n\nbecause one of his key ideas was to use a simple\n\ndata structure called a list for both\n\ncode and data.It\'s worth understanding what McCarthy discovered, not\n\njust as a landmark in the histo